Air Quality Prediction in Busy Streets by UNStudio
!wget https://wdl-data.fra1.digitaloceanspaces.com/unstudio/stadhouderskade_air_quality_2014_to_2022.csv
!pip install catboost shap kaleido
import warnings
import matplotlib as mpl
import numpy as np
import pandas as pd
import statsmodels as sm
import tensorflow as tf
from matplotlib import pyplot as plt
from pylab import rcParams
import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa import api as smt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as offline
import plotly.graph_objs as go
import xgboost as xgb
from catboost import CatBoostClassifier
from catboost import CatBoost, Pool
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import accuracy_score
import shap
from itertools import cycle
warnings.filterwarnings("ignore")
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
plt.style.use('bmh')
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
print(tf.__version__)
import plotly.io as pio
pio.renderers.default = "svg"
df = pd.read_csv("/content/stadhouderskade_air_quality_2014_to_2022.csv", parse_dates=['timestamp_measured'], usecols=['component_id', 'value', 'timestamp_measured'])
Overview
df.head()
df.groupby("component_id").describe()
As we can see, different pollutants have different ranges and values. This will be important later on when computing the Air Quality Index. There are also some missing values (-99) which are negative, so we will also remove all negative values
Missing Values
print(f'Number of rows: {df.shape[0]}; Number of columns: {df.shape[1]}; Number of missing values: {sum(df.isna().sum())}')
print(df.isna().sum())
We observe some missing rows, but they do not contain information, so we will just remove them.
df = df.dropna()
df = df[df.value > 0]
print(f'Number of rows: {df.shape[0]}; No of missing values: {sum(df.isna().sum())}')
General information
unique_times = df['timestamp_measured'].unique()
print(f'Number of unique times: {unique_times.shape[0]}')
print(f'Earliest date: {unique_times.min()}')
print(f'Latest date: {unique_times.max()}')
Pivoting Table
Instead of having one row per pollutant and timestamp, we pivot the table and convert each pollutant into a column so we have a more comprehensive way of addressing the data
df = df.set_index('timestamp_measured')
df = df.pivot_table('value', 'timestamp_measured', 'component_id').reset_index()
df.head()
Missing Values
print(df.isna().sum())
After pivoting the table, we observe a considerable amount of NaN values. The C8H10 contains a huge percentage of NaNs.
Overview plots
rcParams['figure.figsize'] = 18, 15
values = df.values
groups = [1, 2, 3, 4, 5, 6, 7, 8]
i = 1
# plot each column
for group in groups:
plt.subplot(len(groups), 1, i)
plt.plot(values[:, group])
plt.title(df.columns[group], y=0.5, loc='right')
i += 1
plt.show()
We decide to drop the C8H10 column. We also decide to fill the remaining empty values. Given the nature of the data (timeseries), we decide to use a ffill and we carry forward the last occurrence of the dataset for any NaN value.
df = df.fillna(method='ffill')
df = df.drop(columns='C8H10')
df.sort_index(inplace = True)
df = df.dropna()
In order to understand the data a bit more and following the zoom WDL presentation, we decide to look into the seasonal components of the data.
NO2
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.NO2
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
PM2.5
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.PM25
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
We extract two important insights from these plots that can be applied to all of the pollutants.
First, the overall trend seems to be decreasing,the amount of pollutant seems to have decreased from 2015 up to today. Second, there seems to be some sort of seasonality in the data, with important peaks at the beginning of each year.
(Please see the Appendix with the remaining plots and further tests)
We noticed some seasonality in the previous stage, as well as a clear downwards trend. With this in mind, we decided to compute the Common Air Quality Index.
The CAQI is a metric used by the governments to communicate to the population how polluted the air is.
As shown in the table above, this metric is obtained by means of thresholding the values of some of the pollutants that are available in our dataset. By doing so, we obtain a value that then corresponds to one of 5 categories, ranging from 1 - Very low risk to 5 - Very High.
We will use this value to the estimate the levels of pollution present in Stadhouderskade.
(Note that each country has their own Air Quality Index. We decided to use the Common Air Quality Index as this is the European standard.)
CAQI Formulas
## PM25 CAQI calculation
def get_PM25_subindex(x):
if x <= 15:
return 1
elif x <= 30:
return 2
elif x <= 55:
return 3
elif x <= 110:
return 4
elif x > 110:
return 5
else:
return 0
def get_PM10_subindex(x):
if x <= 25:
return 1
elif x <= 50:
return 2
elif x <= 90:
return 3
elif x <= 180:
return 4
elif x > 180:
return 5
else:
return 0
def get_NOx_subindex(x):
if x <= 50:
return 1
elif x <= 100:
return 2
elif x <= 200:
return 3
elif x <= 400:
return 4
elif x > 400:
return 5
else:
return 0
def get_AQI(x):
if x == 1:
return 1 # 'Very Low'
elif x == 2:
return 2 # 'Low'
elif x == 3:
return 3 # 'Medium'
elif x == 4:
return 4 # 'High'
elif x == 5:
return 5 # 'Very High'
else:
return 0
Obtaining CAQI
Here we apply the formula to each of the 3 pollutants. We also extract the overall CAQI by selecting the maximum CAQI value from the 3 pollutants.
df["PM25_CAQI"] = df["PM25"].apply(lambda x: get_PM25_subindex(x))
df["PM10_CAQI"] = df["PM10"].apply(lambda x: get_PM10_subindex(x))
df["NO2_CAQI"] = df["NO2"].apply(lambda x: get_NOx_subindex(x))
df['CAQI'] = df[['PM25_CAQI', 'PM10_CAQI', 'NO2_CAQI']].max(axis=1).apply(lambda x: get_AQI(x))
Analysing CAQI
Extract of days with the worse CAQI values
df.loc[(df['CAQI'] == 5)]
We noticed that the worse CAQI values can be obtained in New Years Eve. This can be explained due to this street being in the middle of Amsterdam, where most of the celebrations take place. These celebrations presumably include fireworks that highly pollute the air.
It is also interesting to see that there are some individual dates that had a high pollution value. We did some research and some of this dates some of these dates correspond to massive events, i.e a dance event on the week of the 17 of October 2019, where the venue was located just at the end of this street and probably leading to a lot of traffic.
With all of this in mind, we decided to make a more comprehensive plot that gives an accurate of the CAQI measurement throughout the year
very_low = "#79bc6a"
low = "#bbcf4c"
medium = "#eec20b"
high = "#f29305"
very_high = "#e8416f"
colors = [very_low, low, medium, high, very_high]
def discrete_colorscale(bvals, colors):
"""
bvals - list of values bounding intervals/ranges of interest
colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0<=k < len(bvals)-1
returns the plotly discrete colorscale
"""
if len(bvals) != len(colors)+1:
raise ValueError('len(boundary values) should be equal to len(colors)+1')
bvals = sorted(bvals)
nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals] #normalized values
dcolorscale = [] #discrete colorscale
for k in range(len(colors)):
dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
return dcolorscale
def display_year(z,
year: int = None,
month_lines: bool = True,
fig=None,
row: int = None):
if year is None:
year = datetime.datetime.now().year
data = np.ones(365) * np.nan
data[:len(z)] = z
d1 = datetime.date(year, 1, 1)
d2 = datetime.date(year, 12, 31)
delta = d2 - d1
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
month_positions = (np.cumsum(month_days) - 15)/7
dates_in_year = [d1 + datetime.timedelta(i) for i in range(delta.days+1)] #gives me a list with datetimes for each day a year
weekdays_in_year = [i.weekday() for i in dates_in_year] #gives [0,1,2,3,4,5,6,0,1,2,3,4,5,6,…] (ticktext in xaxis dict translates this to weekdays
weeknumber_of_dates = [int(i.strftime("%V")) if not (int(i.strftime("%V")) == 1 and i.month == 12) else 53
for i in dates_in_year] #gives [1,1,1,1,1,1,1,2,2,2,2,2,2,2,…] name is self-explanatory
text = [str(i) for i in dates_in_year] #gives something like list of strings like ‘2018-01-25’ for each date. Used in data trace to make good hovertext.
#4cc417 green #347c17 dark green
colorscale=discrete_colorscale([0, 1, 2, 3, 4, 5], colors)
# handle end of year
data = [
go.Heatmap(
x=weeknumber_of_dates,
y=weekdays_in_year,
z=data,
text=text,
hoverinfo='text+z',
xgap=3, # this
ygap=3, # and this is used to make the grid-like apperance
showscale=False,
colorscale=colorscale
)
]
if month_lines:
kwargs = dict(
mode='lines',
line=dict(
color='gray',
width=1
),
hoverinfo='skip'
)
for date, dow, wkn in zip(dates_in_year,
weekdays_in_year,
weeknumber_of_dates):
if date.day == 1:
data += [
go.Scatter(
x=[wkn-.5, wkn-.5],
y=[dow-.5, 6.5],
**kwargs
)
]
if dow:
data += [
go.Scatter(
x=[wkn-.5, wkn+.5],
y=[dow-.5, dow - .5],
**kwargs
),
go.Scatter(
x=[wkn+.5, wkn+.5],
y=[dow-.5, -.5],
**kwargs
)
]
layout = go.Layout(
title='CAQI in the last 3 years',
height=250,
yaxis=dict(
showline=False, showgrid=False, zeroline=False,
tickmode='array',
ticktext=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
tickvals=[0, 1, 2, 3, 4, 5, 6],
autorange="reversed"
),
xaxis=dict(
showline=False, showgrid=False, zeroline=False,
tickmode='array',
ticktext=month_names,
tickvals=month_positions
),
margin = dict(t=40),
showlegend=False
)
if fig is None:
fig = go.Figure(data=data, layout=layout)
fig.update_layout(plot_bgcolor='rgb(242,242,242)', paper_bgcolor = 'rgb(242,242,242)')
else:
fig.add_traces(data, rows=[(row+1)]*len(data), cols=[1]*len(data))
fig.update_layout(layout, plot_bgcolor='rgb(242,242,242)', paper_bgcolor = 'rgb(242,242,242)')
fig.update_xaxes(layout['xaxis'])
fig.update_yaxes(layout['yaxis'])
return fig
def display_years(z, years):
fig = make_subplots(rows=len(years), cols=1, subplot_titles=years)
for i, year in enumerate(years):
data = z[i*365 : (i+1)*365]
display_year(data, year=year, fig=fig, row=i)
fig.update_layout(height=250*len(years), width=1300)
return fig
palette = cycle(px.colors.sequential.Viridis)
df_graph = df[['timestamp_measured', 'CAQI']]
df2015 = df_graph[df_graph['timestamp_measured'].dt.year == 2015]
df2019 = df_graph[df_graph['timestamp_measured'].dt.year == 2019]
df2020 = df_graph[df_graph['timestamp_measured'].dt.year == 2020]
df2021 = df_graph[df_graph['timestamp_measured'].dt.year == 2021]
# chart
fig = make_subplots(rows=4, cols=2,
specs=[[{"type": "bar"}, {"type": "scatter"}], [{"colspan": 2}, None], [{"colspan": 2}, None], [{'type':'indicator'}, {'type':'bar'}]],
column_widths=[0.4, 0.6], vertical_spacing=0.1, horizontal_spacing=0.1,
subplot_titles=("Mean CAQI per Day of Week", "Hourly CAQI Trend", "Daily CAQI Trend", "2020 vs 2021", "Worst day: 1st of January 2019","CAQI Hourly Value Counts"))
# Upper Left chart
df_day = df_graph.groupby([df_graph["timestamp_measured"].dt.weekday]).mean().reset_index().sort_values(by='CAQI', ascending = False)
values = list(range(7))
fig.add_trace(go.Bar(x=df_day["timestamp_measured"], y=df_day['CAQI'], marker = dict(color=values, colorscale="Viridis"),
name = 'Day of Week'),
row=1, col=1)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=1, col=1)
fig.update_yaxes(showgrid = False, linecolor='gray',linewidth=2, zeroline = False, row=1, col=1)
# Upper Right chart
df_hour = df_graph.groupby([df_graph["timestamp_measured"].dt.hour]).mean().reset_index('timestamp_measured')
fig.add_trace(go.Scatter(x=df_hour["timestamp_measured"], y=df_hour['CAQI'], mode='lines+markers',
name='Hourly CAQI'), row = 1, col = 2)
# Rectangle to highlight range
fig.add_vrect(x0=5, x1=12,
fillcolor=px.colors.sequential.Viridis[4],
layer="below",
opacity=0.25,
line_width=0,
row = 1, col = 2
)
fig.add_annotation(dict(
x=7,
y=df_hour.loc[8,'CAQI']+0.004,
text="There is a <b>peak at <br>7am</b> coinciding with<br>going to work.",
ax="-20",
ay="-60",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=1, col=2)
fig.add_annotation(dict(
x=8.5,
y=1.45,
text="Morning hours are <br><b>the rush hours</b>.",
showarrow = False
), row=1, col=2)
fig.add_annotation(dict(
x=16,
y=1.39,
text="From 4pm <br> <b>CAQI<br> rises slightly again</b>.",
ax="0",
ay="-100",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=1, col=2)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=1, col=2)
fig.update_yaxes(showgrid = False, linecolor='gray', linewidth=2, row=1, col=2)
# Medium Chart
df_week = df_graph.groupby([df_graph["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week["timestamp_measured"], y = df_week['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[5]),
name='Daily CAQI'), row = 2, col = 1)
from statsmodels.tsa.seasonal import seasonal_decompose
decomp = seasonal_decompose(df_week['CAQI'], model='additive', freq = 21)
fig.add_trace(go.Scatter(x = df_week["timestamp_measured"], y = decomp.trend, mode='lines',
marker = dict(color = medium),
name='CAQI Trend'), row = 2, col = 1)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, row=2, col=1)
fig.update_yaxes(gridcolor = 'gray', gridwidth = 0.15, linecolor='gray',linewidth=2, row=2, col=1)
df_week2015= df2015.groupby([df2015["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week2015["timestamp_measured"], y = df_week2015['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[5]),
name='2015'), row = 3, col = 1)
df_week2021 = df2021.groupby([df2021["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week2021["timestamp_measured"], y = df_week2021['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[8]),
name='2021'), row = 3, col = 1)
# Left Bottom Chart
fig.add_trace(go.Indicator(mode = "number", value = 537, title = 'The first hours of New Year always have bad air quality ', number = {'prefix': "PM10 "},), row = 4, col = 1)
# Right Bottom Chart
counts = df_graph['CAQI'].value_counts(ascending=False)
fig.add_trace(go.Bar(x=counts, y=['Very Low', 'Low', 'Medium', 'High', 'Very High'], marker = dict(color=colors), name = 'CAQI Values', orientation = 'h'),
row=4, col=2)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=3, col=2)
fig.update_yaxes(showgrid = False, linecolor='gray',linewidth=2, zeroline = False, row=3, col=2)
fig.add_annotation(dict(
x=33+0.15,
y='Very High',
text="Highest CAQI <br> are <b> more unusual</b>.",
ax="110",
ay="-20",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=4, col=2)
# General Styling
fig.update_layout(height=1300, width=1300, bargap=0.2,
margin=dict(b=50,r=30,l=100),
title = "<span style='font-size:36px; font-family:Times New Roman'>CAQI Analysis</span>",
plot_bgcolor='rgb(242,242,242)',
paper_bgcolor = 'rgb(242,242,242)',
font=dict(family="Times New Roman", size= 14),
hoverlabel=dict(font_color="floralwhite"),
showlegend=False)
fig.show()
df20 = df_graph[df_graph['timestamp_measured'].dt.year >= 2019]
z = df20.groupby([df20["timestamp_measured"].dt.date])['CAQI'].mean()
display_years(z, (2019, 2020, 2021))
These plots are already really informative, but let's point out the main findings:
For this specific challenge, there is an element that we must take into account:
COVID-19
Throughout the world, the outbreak of the pandemic already lowered the amount of traffic significantly. In the specific case of the Netherlands, at the end of 2020, the Dutch government first introduced a strict lockdown throughout the entire country, following a rapid rise in COVID-19 cases. This clearly had an impact in the traffic and therefore also in the pollution levels.
In order to try to minimize the impact of this pandemic in our model, we have decided to train over the dataset from 2014-2019 and validate over 2020, excluding 2021. Most countries had restrictions in place over 2021, but the Netherlands seemed to be less proactive in this timeframe and only introduced major restrictions during late 2020-2021.
train_df = df[df["timestamp_measured"].dt.year < 2020]
test_df = df[df["timestamp_measured"].dt.year == 2020]
train_df.head()
We are setting the targets of our model to be the three pollutants that we use to compute the CAQI
targets = ['NO2', 'PM10', 'PM25', 'PM25_CAQI', 'PM10_CAQI', 'NO2_CAQI', 'CAQI']
X_train = train_df.copy()
X_train = X_train.drop(['timestamp_measured'], axis=1)
Y_train = X_train['CAQI']
X_train = X_train.drop(targets, axis=1)
X_test = test_df.copy()
X_test = X_test.drop(['timestamp_measured'], axis=1)
Y_test = X_test['CAQI']
X_test = X_test.drop(targets, axis=1)
Following the best ML practices, we will first develop a simple baseline model that we can then compare against. We will use a simple XGBoost model trained over the basic dataset.
clf = xgb.XGBClassifier()
clf.fit(X_train, Y_train)
xgb.plot_importance(clf)
preds = clf.predict(X_test)
print(accuracy_score(Y_test, preds))
train_pool = Pool(X_train, Y_train)
test_pool = Pool(X_test)
model = CatBoostClassifier()
model.fit(train_pool)
preds = model.predict(test_pool)
print(accuracy_score(Y_test, preds))
In this section, we do some feature engineering to choose the best features for our prediction model. We also experimented with many more statistical features, but they were not relevant in the end. We added that to the Appendix.
We start by adding datetime features
def add_datetime_features(df):
df['year'] = df['timestamp_measured'].dt.year
df['month'] = df['timestamp_measured'].dt.month
df['day'] = df['timestamp_measured'].dt.day
df['week'] = (df['timestamp_measured'].dt.isocalendar().week).astype("int")
df['weekday'] = df['timestamp_measured'].dt.weekday
df['weekend'] = (df['timestamp_measured'].dt.weekday >= 5).astype("int")
df['hour'] = df['timestamp_measured'].dt.hour
df['afternoon'] = (df['hour'] >= 12).astype("int")
add_datetime_features(train_df)
add_datetime_features(test_df)
Next, we add the most important features in timeseries datasets, lags. We add 7 hour lags and 7 days lags.
# Time lags
targets = ['NO2', 'PM10', 'PM25']
for t in targets:
for delta in range(1,8):
day = train_df.copy()
day['timestamp_measured'] = day['timestamp_measured'] + pd.Timedelta(delta, unit="d")
name = f'{t}_lag_{delta}'
day = day.rename(columns={t:name})[['timestamp_measured', name]]
train_df = train_df.merge(day, on=['timestamp_measured'], how='left')
for t in targets:
for delta in range(1,8):
day = train_df.copy()
day['timestamp_measured'] = day['timestamp_measured'] + pd.Timedelta(delta, unit="h")
name = f'{t}_lag_hour_{delta}'
day = day.rename(columns={t:name})[['timestamp_measured', name]]
train_df = train_df.merge(day, on=['timestamp_measured'], how='left')
targets = ['NO2', 'PM10', 'PM25']
for t in targets:
for delta in range(1,8):
day = test_df.copy()
day['timestamp_measured'] = day['timestamp_measured'] + pd.Timedelta(delta, unit="d")
name = f'{t}_lag_{delta}'
day = day.rename(columns={t:name})[['timestamp_measured', name]]
test_df = test_df.merge(day, on=['timestamp_measured'], how='left')
for t in targets:
for delta in range(1,8):
day = test_df.copy()
day['timestamp_measured'] = day['timestamp_measured'] + pd.Timedelta(delta, unit="h")
name = f'{t}_lag_hour_{delta}'
day = day.rename(columns={t:name})[['timestamp_measured', name]]
test_df = test_df.merge(day, on=['timestamp_measured'], how='left')
targets = ['NO2', 'PM10', 'PM25', 'PM25_CAQI', 'PM10_CAQI', 'NO2_CAQI', 'CAQI']
X_train = train_df.copy()
X_train = X_train.drop(['timestamp_measured'], axis=1)
Y_train = X_train['CAQI']
X_train = X_train.drop(targets, axis=1)
X_test = test_df.copy()
X_test = X_test.drop(['timestamp_measured'], axis=1)
Y_test = X_test['CAQI']
X_test = X_test.drop(targets, axis=1)
Next, we will look at the mutual information of these features to check that they are good features.
from sklearn.feature_selection import mutual_info_regression
import seaborn as sns
# Mutual Information does not support Nans, we filled them with 0
Xmi_train = X_train.fillna(0)
mi_scores = mutual_info_regression(Xmi_train[:52173], Y_train)
mi_scores = pd.Series(mi_scores, name="MI_score", index=Xmi_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)
df_mi_scores = pd.DataFrame(mi_scores).reset_index().rename(columns={'component_id':'feature'})
sns.barplot(y=df_mi_scores['feature'].loc[:20], x=df_mi_scores['MI_score'].loc[:20])
Now we model with the features we choosed in the last section
clf = xgb.XGBClassifier()
clf.fit(X_train, Y_train)
xgb.plot_importance(clf)
preds = clf.predict(X_test)
print(accuracy_score(Y_test, preds))
train_pool = Pool(X_train, Y_train)
test_pool = Pool(X_test)
model = CatBoostClassifier()
model.fit(train_pool)
preds = model.predict(test_pool)
print(accuracy_score(Y_test, preds))
Explainability of the models is important so we can understand how and why the model has made certain predictions. For this, we can obtain SHAP values.
prediction = model.predict(X_test.values)
for i in range(5):
data_for_prediction = X_test.iloc[[i]]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(data_for_prediction.values)
shap.initjs()
plt = shap.force_plot(explainer.expected_value[0], shap_values[0], data_for_prediction)
display(plt)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test.values)
plt = shap.summary_plot(shap_values, X_test, max_display=X_test.shape[1])
display(plt)
Copy here the most important visualizations (graphs, charts, maps, images, etc). You can refer to them in the Executive Summary.
Technical note: If not all the visualisations are visible, you can still include them as an image or link - in this case please upload them to your own repository.
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.NO2
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
palette = cycle(px.colors.sequential.Viridis)
df_graph = df[['timestamp_measured', 'CAQI']]
df2015 = df_graph[df_graph['timestamp_measured'].dt.year == 2015]
df2019 = df_graph[df_graph['timestamp_measured'].dt.year == 2019]
df2020 = df_graph[df_graph['timestamp_measured'].dt.year == 2020]
df2021 = df_graph[df_graph['timestamp_measured'].dt.year == 2021]
# chart
fig = make_subplots(rows=4, cols=2,
specs=[[{"type": "bar"}, {"type": "scatter"}], [{"colspan": 2}, None], [{"colspan": 2}, None], [{'type':'indicator'}, {'type':'bar'}]],
column_widths=[0.4, 0.6], vertical_spacing=0.1, horizontal_spacing=0.1,
subplot_titles=("Mean CAQI per Day of Week", "Hourly CAQI Trend", "Daily CAQI Trend", "2020 vs 2021", "Worst day: 1st of January 2019","CAQI Hourly Value Counts"))
# Upper Left chart
df_day = df_graph.groupby([df_graph["timestamp_measured"].dt.weekday]).mean().reset_index().sort_values(by='CAQI', ascending = False)
values = list(range(7))
fig.add_trace(go.Bar(x=df_day["timestamp_measured"], y=df_day['CAQI'], marker = dict(color=values, colorscale="Viridis"),
name = 'Day of Week'),
row=1, col=1)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=1, col=1)
fig.update_yaxes(showgrid = False, linecolor='gray',linewidth=2, zeroline = False, row=1, col=1)
# Upper Right chart
df_hour = df_graph.groupby([df_graph["timestamp_measured"].dt.hour]).mean().reset_index('timestamp_measured')
fig.add_trace(go.Scatter(x=df_hour["timestamp_measured"], y=df_hour['CAQI'], mode='lines+markers',
name='Hourly CAQI'), row = 1, col = 2)
# Rectangle to highlight range
fig.add_vrect(x0=5, x1=12,
fillcolor=px.colors.sequential.Viridis[4],
layer="below",
opacity=0.25,
line_width=0,
row = 1, col = 2
)
fig.add_annotation(dict(
x=7,
y=df_hour.loc[8,'CAQI']+0.004,
text="There is a <b>peak at <br>7am</b> coinciding with<br>going to work.",
ax="-20",
ay="-60",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=1, col=2)
fig.add_annotation(dict(
x=8.5,
y=1.45,
text="Morning hours are <br><b>the rush hours</b>.",
showarrow = False
), row=1, col=2)
fig.add_annotation(dict(
x=16,
y=1.39,
text="From 4pm <br> <b>CAQI<br> rises slightly again</b>.",
ax="0",
ay="-100",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=1, col=2)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=1, col=2)
fig.update_yaxes(showgrid = False, linecolor='gray', linewidth=2, row=1, col=2)
# Medium Chart
df_week = df_graph.groupby([df_graph["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week["timestamp_measured"], y = df_week['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[5]),
name='Daily CAQI'), row = 2, col = 1)
from statsmodels.tsa.seasonal import seasonal_decompose
decomp = seasonal_decompose(df_week['CAQI'], model='additive', freq = 21)
fig.add_trace(go.Scatter(x = df_week["timestamp_measured"], y = decomp.trend, mode='lines',
marker = dict(color = medium),
name='CAQI Trend'), row = 2, col = 1)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, row=2, col=1)
fig.update_yaxes(gridcolor = 'gray', gridwidth = 0.15, linecolor='gray',linewidth=2, row=2, col=1)
df_week2015= df2015.groupby([df2015["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week2015["timestamp_measured"], y = df_week2015['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[5]),
name='2015'), row = 3, col = 1)
df_week2021 = df2021.groupby([df2021["timestamp_measured"].dt.dayofyear]).mean().reset_index()
fig.add_trace(go.Scatter(x = df_week2021["timestamp_measured"], y = df_week2021['CAQI'], mode='lines',
marker = dict(color = px.colors.sequential.Viridis[8]),
name='2021'), row = 3, col = 1)
# Left Bottom Chart
fig.add_trace(go.Indicator(mode = "number", value = 537, title = 'The first hours of New Year always have bad air quality ', number = {'prefix': "PM10 "},), row = 4, col = 1)
# Right Bottom Chart
counts = df_graph['CAQI'].value_counts(ascending=False)
fig.add_trace(go.Bar(x=counts, y=['Very Low', 'Low', 'Medium', 'High', 'Very High'], marker = dict(color=colors), name = 'CAQI Values', orientation = 'h'),
row=4, col=2)
fig.update_xaxes(showgrid = False, linecolor='gray', linewidth = 2, zeroline = False, row=3, col=2)
fig.update_yaxes(showgrid = False, linecolor='gray',linewidth=2, zeroline = False, row=3, col=2)
fig.add_annotation(dict(
x=33+0.15,
y='Very High',
text="Highest CAQI <br> are <b> more unusual</b>.",
ax="110",
ay="-20",
showarrow = True,
arrowhead = 7,
arrowwidth = 0.7
), row=4, col=2)
# General Styling
fig.update_layout(height=1300, width=1300, bargap=0.2,
margin=dict(b=50,r=30,l=100),
title = "<span style='font-size:36px; font-family:Times New Roman'>CAQI Analysis</span>",
plot_bgcolor='rgb(242,242,242)',
paper_bgcolor = 'rgb(242,242,242)',
font=dict(family="Times New Roman", size= 14),
hoverlabel=dict(font_color="floralwhite"),
showlegend=False)
fig.show()
df20 = df_graph[df_graph['timestamp_measured'].dt.year >= 2019]
z = df20.groupby([df20["timestamp_measured"].dt.date])['CAQI'].mean()
display_years(z, (2019, 2020, 2021))
List all of the external links (even if they are already linked above), such as external datasets, papers, blog posts, code repositories and any other materials.
Dataset
Paper
Resource
def tsplot(y, lags=None, figsize=(12, 7), syle='bmh'):
if not isinstance(y, pd.Series):
y = pd.Series(y)
with plt.style.context(style='bmh'):
fig = plt.figure(figsize=(12, 7))
layout = (3, 2)
ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
acf_ax = plt.subplot2grid(layout, (1, 0))
pacf_ax = plt.subplot2grid(layout, (1, 1))
mean_std_ax = plt.subplot2grid(layout, (2, 0), colspan=2)
y.plot(ax=ts_ax)
p_value = sm.tsa.stattools.adfuller(y)[1]
hypothesis_result = "We reject stationarity" if p_value <= 0.05 else "We can not reject stationarity"
ts_ax.set_title(
'Time Series stationary analysis Plots\n Dickey-Fuller: p={0:.5f} Result: {1}'.format(p_value, hypothesis_result))
smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
plt.tight_layout()
rolmean = y.rolling(window=8760).mean()
rolstd = y.rolling(window=8760).std()
# Plot rolling statistics:
orig = plt.plot(y, label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
tsplot(df.NO2, lags=30)
tsplot(df.NO, lags=30)
tsplot(df.FN, lags=30)
tsplot(df.C7H8, lags=30)
tsplot(df.C6H6, lags=30)
tsplot(df.PM25, lags=30)
tsplot(df.PM10, lags=30)
Plots for the remaining pollutants
def plot_variance(pca, width=8, dpi=100):
# Create figure
fig, axs = plt.subplots(2, 1)
n = pca.n_components_
grid = np.arange(1, n + 1)
# Explained variance
evr = pca.explained_variance_ratio_
axs[0].bar(grid, evr)
axs[0].set(
xlabel="Component", title="% Explained Variance", ylim=(0.0, 0.2)
)
# Cumulative Variance
cv = np.cumsum(evr)
axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
axs[1].set(
xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
)
# Set up figure
fig.set(figwidth=width, dpi=100)
return axs
X = df.copy()
X_t = df.copy()
y = X.pop('PM25')
X = X.loc[:, features]
X_t = X_t.loc[:, features]
from sklearn.decomposition import PCA
# Create principal components
pca = PCA()
X_pca = pca.fit_transform(X)
X_t_pca = pca.transform(X_t)
# Convert to dataframe
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)
X_t_pca = pd.DataFrame(X_t_pca, columns=component_names)
X_pca.head(3)
loadings = pd.DataFrame(
pca.components_.T, # transpose the matrix of loadings
columns=component_names, # so the columns are the principal components
index=X.columns, # and the rows are the original features
)
loadings.head(6)
plot_variance(pca, width=6);
df_train = pd.concat([df, X_pca], axis=1)
df_test = pd.concat([df, X_t_pca], axis=1)
from catboost import CatBoostRegressor
import eli5
from eli5.sklearn import PermutationImportance
tst_start = pd.to_datetime('2016-01-01 00:00', unit="ns", utc=True)
tst_finish = pd.to_datetime('2016-12-31 23:00', unit="ns", utc=True)
valid_test = df_train[(df_train['timestamp_measured'] >= tst_start) & (df_train['timestamp_measured'] <= tst_finish)].drop(['PM25'], axis=1)
valid_train = df_train[df_train['timestamp_measured'] < tst_start]
tmp = df_train[(df_train['timestamp_measured'] >= tst_start) & (df_train['timestamp_measured'] <= tst_finish)]
valid_target = tmp.pop('PM25')
valid_sample_submission = tmp.copy()
valid_sample_submission['PM25'] = 50
valid_sample_submission = valid_sample_submission['PM25']
X_train = valid_train.copy()
X_train = X_train.drop(['timestamp_measured'], axis=1)
Y_train = X_train['PM25']
X_train = X_train.drop(['PM25'], axis=1)
X_test = valid_test.drop(['timestamp_measured'], axis=1)
model = CatBoostRegressor(logging_level='Silent', n_estimators=800, eval_metric='MAE', loss_function='MAE').fit(X_train.values, Y_train.values)
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=1).fit(X_test, valid_target)
eli5.show_weights(perm, feature_names = X_test.columns.tolist(), top=None)
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.NO
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.FN
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.C7H8
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.C6H6
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.PM25
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
rcParams['figure.figsize'] = 18, 8
plt.figure(num=None, figsize=(50, 20), dpi=80, facecolor='w', edgecolor='k')
series = df.PM10
result = seasonal_decompose(series, model='multiplicative', freq = 8760)
res = result.plot()
from catboost import CatBoostRegressor
import eli5
from eli5.sklearn import PermutationImportance
tst_start = pd.to_datetime('2016-01-01 00:00', unit="ns", utc=True)
tst_finish = pd.to_datetime('2016-12-31 23:00', unit="ns", utc=True)
res = []
for t in targets:
valid_test = df[(df['timestamp_measured'] >= tst_start) & (df['timestamp_measured'] <= tst_finish)].drop([t], axis=1)
valid_train = df[df['timestamp_measured'] < tst_start]
tmp = df[(df['timestamp_measured'] >= tst_start) & (df['timestamp_measured'] <= tst_finish)]
valid_target = tmp.pop(t)
X_train = valid_train.copy()
X_train = X_train.drop(['timestamp_measured'], axis=1)
Y_train = X_train[t]
X_train = X_train.drop([t], axis=1)
X_test = valid_test.drop(['timestamp_measured'], axis=1)
model = CatBoostRegressor(logging_level='Silent', random_state=42, eval_metric='MAE', loss_function='MAE').fit(X_train.values, Y_train.values)
perm = PermutationImportance(model, random_state=1).fit(X_test, valid_target)
display(eli5.show_weights(perm, feature_names = X_test.columns.tolist(), top=None))
We also tested some statistical features, but they were not as useful.
# Statistical features
targets = ['NO2', 'PM10', 'PM25']
for t in targets:
medians = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].median().astype(int)).reset_index()
medians = medians.rename(columns={t:f'{t}_median'})
df = df.merge(medians, on=['weekday', 'hour'], how='left')
stds = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].std().astype(int)).reset_index()
stds = stds.rename(columns={t:f'{t}_std'})
df = df.merge(stds, on=['weekday', 'hour'], how='left')
mins = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].min().astype(int)).reset_index()
mins = mins.rename(columns={t:f'{t}_min'})
df = df.merge(mins, on=['weekday', 'hour'], how='left')
maxs = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].max().astype(int)).reset_index()
maxs = maxs.rename(columns={t:f'{t}_max'})
df = df.merge(maxs, on=['weekday', 'hour'], how='left')
df_mornings = df[(df.hour >= 6) & (df.hour < 12)]
morning_avgs = pd.DataFrame(df_mornings.groupby(['month', 'day'])[t].median().astype(int)).reset_index()
morning_avgs = morning_avgs.rename(columns={t:f'{t}_morning_avg'})
df = df.merge(morning_avgs, on=['month', 'day'], how='left')
quantile25 = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].quantile([.25]).astype(int)).reset_index()
quantile25 = quantile25.rename(columns={t:f'{t}_quantile25'}).drop(['level_2'], axis=1)
df = df.merge(quantile25, on=['weekday', 'hour'], how='left')
quantile75 = pd.DataFrame(df.groupby(['weekday', 'hour'])[t].quantile([.75]).astype(int)).reset_index()
quantile75 = quantile75.rename(columns={t:f'{t}_quantile75'}).drop(['level_2'], axis=1)
df = df.merge(quantile75, on=['weekday', 'hour'], how='left')